This sample consists of data for 150 subjects of the original sample of 522 that had completed the initial battery of 37 cognitive tasks, 23 surveys and 3 surveys on demographics. Details of the original sample as well as quality control (qc) procedures are described elsewhere (Eisenberg et al., 2017). Invited participants were chosen randomly, and only subsets of them were invited for a given batch (instead of opening the battery to all qualified subjects) with the intention of avoiding potential oversampling of, and bias towards, “high self regulators”.
# MTurk worker roster: flag workers invited to any retest batch
# (CURRENT.RetestWorker through ...B5). A worker counts as a retest worker
# if the batch flags sum to a positive value.
workers = read.csv('/Users/zeynepenkavi/Documents/PoldrackLabLocal/Self_Regulation_Ontology/Data/Retest_09-27-2017/Local/User_717570_workers.csv')
workers = workers %>%
group_by(Worker.ID) %>%
# na.rm=TRUE spelled out (T can be reassigned and should not be relied on)
mutate(Retest_worker=ifelse(sum(CURRENT.RetestWorker,CURRENT.RetestWorkerB2,CURRENT.RetestWorkerB3,CURRENT.RetestWorkerB4,CURRENT.RetestWorkerB5,na.rm=TRUE)>0,1,0)) %>%
ungroup()
# Per-worker count of completed tasks, read from JSON and reshaped into a
# one-column data frame named task_count.
worker_counts <- fromJSON('/Users/zeynepenkavi/Documents/PoldrackLabLocal/Self_Regulation_Ontology/Data/Retest_09-27-2017/Local/retest_worker_counts.json')
worker_counts = as.data.frame(unlist(worker_counts))
names(worker_counts) = "task_count"
In total 242 participants were invited, 175 began the battery, 157 completed the battery and 150 provided data that passed qc for both time points. Our target sample size was determined in advance of data collection and data collection continued until this number of participants with data that survived qc was reached.
# Completion dates for the initial (t1) battery. The original sample was
# split into discovery and validation subsamples, so the two files are
# stacked into one t1 table.
disc_comp_date = read.csv('/Users/zeynepenkavi/Documents/PoldrackLabLocal/Self_Regulation_Ontology/Data/Retest_09-27-2017/Local/discovery_completion_dates.csv', header=FALSE)
val_comp_date = read.csv('/Users/zeynepenkavi/Documents/PoldrackLabLocal/Self_Regulation_Ontology/Data/Retest_09-27-2017/Local/validation_completion_dates.csv', header=FALSE)
test_comp_date = rbind(disc_comp_date, val_comp_date)
rm(disc_comp_date, val_comp_date)
# Retest (t2) completion dates. Merging on V1 (default name of the first
# column in these header-less files; presumably the subject id -- confirm)
# keeps only subjects present at both time points.
retest_comp_date = read.csv('/Users/zeynepenkavi/Documents/PoldrackLabLocal/Self_Regulation_Ontology/Data/Retest_09-27-2017/Local/retest_completion_dates.csv', header=FALSE)
comp_dates = merge(retest_comp_date, test_comp_date, by="V1")
names(comp_dates) <- c("sub_id", "retest_comp", "test_comp")
# Convert to Date so the subtraction below yields a difftime in days.
comp_dates$retest_comp = as.Date(comp_dates$retest_comp)
comp_dates$test_comp = as.Date(comp_dates$test_comp)
comp_dates$days_btw = with(comp_dates, retest_comp-test_comp)
Data collection took place on average 115 days after the completion of the initial battery, with a range of 60 to 228 days.
# Demographics for both time points. Restrict the retest file to subjects
# that also appear in the t1 file (matched on the X id column), then
# rename X to sub_id in both data frames.
test_demog <- read.csv('/Users/zeynepenkavi/Documents/PoldrackLabLocal/Self_Regulation_Ontology/Data/Retest_09-27-2017/t1_data/demographic_health.csv')
retest_demog <- read.csv('/Users/zeynepenkavi/Documents/PoldrackLabLocal/Self_Regulation_Ontology/Data/Retest_09-27-2017/demographic_health.csv')
retest_demog = retest_demog[retest_demog$X %in% test_demog$X,]
names(test_demog)[which(names(test_demog) == 'X')] <-'sub_id'
names(retest_demog)[which(names(retest_demog) == 'X')] <-'sub_id'
# Sample description. Sex appears to be coded 0/1 (based on the 0-1 range
# in the output -- confirm which level is which); Age is in years.
summary(retest_demog %>%
select(Sex, Age))
Sex Age
Min. :0.000 Min. :21.0
1st Qu.:0.000 1st Qu.:29.0
Median :1.000 Median :33.0
Mean :0.527 Mean :34.5
3rd Qu.:1.000 3rd Qu.:38.8
Max. :1.000 Max. :60.0
One of the major contributions of this project is a comprehensive literature review of the retest reliabilities of the surveys and tasks that were used. We reviewed the literature at the measure level (as opposed to the task level), paying attention to differences in sample size, the delay between the two measurements, as well as the statistic that was used to assess reliabilities (e.g. Spearman vs. Pearson correlations). Here we present a table and a visualization summarizing our findings. References mentioned in the table below can be found here.
# Literature-review table for the battery's measures (reported retest
# reliabilities with sample size, delay, and statistic type per entry).
lit_review <- read.csv('/Users/zeynepenkavi/Dropbox/PoldrackLab/SRO_Retest_Analyses/input/lit_review_figure.csv')
lit_review
Measure level plot
# Tidy the literature-review table for plotting: split the dv string into
# a task_group and a variable name, order task groups so that tasks and
# surveys cluster, and clean up labels for display.
lit_review = lit_review %>%
separate(dv, c("task_group", "var"), sep="\\.",remove=FALSE,extra="merge") %>%
# Factor levels come from the current row order sorted by task type;
# NOTE(review): this depends on the row order of lit_review at this point.
mutate(task_group = factor(task_group, levels = task_group[order(task)]),
type = as.character(type)) %>%
mutate(task_group = gsub("_", " ", task_group),
var = gsub("_", " ", var)) %>%
arrange(task_group, raw_fit, var) %>%
# Manual shortening of two long task names.
mutate(task_group = ifelse(task_group == "psychological refractory period two choices", "psychological refractory period", ifelse(task_group == "angling risk task always sunny", "angling risk task",task_group))) %>%
mutate(task_group = gsub("survey", "", task_group)) %>%
select(-measure_description, -reference)
Because this plot is difficult to digest we summarize it on a task level to give a general sense of the main takeaways. This plot naturally disregards much of the fine grained information.
# Task panel of the literature-review figure built WITH a visible legend,
# used only so g_legend() can later harvest the shared legend. Point size
# encodes sample size, shape encodes statistic type. Oversized fonts are
# intentional: the combined figure is saved at poster scale.
p1_t_legend <- lit_review %>%
filter(task == 'task') %>%
ggplot(aes(y = factor(task_group, levels=rev(unique(task_group))), x = retest_reliability)) +
geom_point(aes(size=sample_size, shape = type), color='black')+
theme_bw()+
theme(axis.text.y = element_text(size=43),
legend.position = 'bottom',
axis.text.x = element_text(size=23),
axis.title.x = element_text(size=30),
legend.text = element_text(size=20),
legend.key.width = unit(0.75, "inches"),
legend.title = element_text(size=28),
legend.spacing.x = unit(0.5, "inches")) +
# Enlarge legend keys so they stay readable at poster size.
guides(size = guide_legend(override.aes = list(size=c(9,18,28))),
shape = guide_legend(override.aes = list(size=18)))+
xlab("Reliability")+
ylab("")+
scale_x_continuous(limits = c(-0.25,1), breaks=c(-0.25, 0, 0.25, 0.5, 0.75, 1))+
scale_y_discrete(labels = function(x) str_wrap(x, width = 10))+
scale_shape_manual(breaks = sort(lit_review$type), values = c(15, 16, 17, 3), name="Type")+
scale_size_continuous(name = "Sample Size")+
# Red line marks zero reliability.
geom_vline(xintercept = 0, color = "red", size = 1)
# Task panel proper (no legend; the shared legend is added when the
# panels are combined). Same layout as p1_t_legend but in the task color.
p1_t <- lit_review %>%
filter(task == 'task') %>%
ggplot(aes(y = factor(task_group, levels=rev(unique(task_group))), x = retest_reliability)) +
geom_point(aes(size=sample_size, shape = type), color='#00BFC4')+
theme_bw()+
theme(axis.text.y = element_text(size=43),
legend.position = 'none',
axis.text.x = element_text(size=23),
axis.title.x = element_text(size=30)) +
xlab("Reliability")+
ylab("")+
scale_x_continuous(limits = c(-0.25,1), breaks=c(-0.25, 0, 0.25, 0.5, 0.75, 1))+
scale_y_discrete(labels = function(x) str_wrap(x, width = 10))+
scale_shape_manual(breaks = sort(lit_review$type), values = c(15, 16, 17, 3))+
scale_size_continuous(range = c(5, 35))+
geom_vline(xintercept = 0, color = "red", size = 1)
# Survey panel, mirroring the task panel's layout in the survey color.
p2_t <- lit_review %>%
filter(task == 'survey') %>%
ggplot(aes(y = factor(task_group, levels=rev(unique(task_group))), x = retest_reliability)) +
geom_point(aes(size=sample_size, shape = type), color='#F8766D')+
theme_bw()+
theme(axis.text.y = element_text(size=43),
legend.position = 'none',
axis.text.x = element_text(size=23),
axis.title.x = element_text(size=30)) +
xlab("Reliability")+
ylab("")+
scale_x_continuous(limits = c(-0.25,1), breaks=c(-0.25, 0, 0.25, 0.5, 0.75, 1))+
scale_y_discrete(labels = function(x) str_wrap(x, width = 10))+
scale_shape_manual(breaks = sort(lit_review$type), values = c(15, 16, 17, 3))+
scale_size_continuous(range = c(5, 35))+
geom_vline(xintercept = 0, color = "red", size = 1)
# Harvest the shared legend (g_legend is defined elsewhere), place the
# task and survey panels side by side with the legend in a thin bottom
# strip, save at poster size, then free the intermediate plot objects.
mylegend<-g_legend(p1_t_legend)
p3_t <- arrangeGrob(arrangeGrob(p1_t, p2_t, nrow=1), mylegend, nrow=2,heights=c(10, 1))
ggsave('Lit_Review_Plot_t.jpg', plot = p3_t, device = "jpeg", path = "/Users/zeynepenkavi/Dropbox/PoldrackLab/SRO_Retest_Analyses/output/figures/", width = 27, height = 48, units = "in", limitsize = FALSE, dpi = 72)
rm(p1_t, p2_t, p3_t, mylegend, p1_t_legend)
An interactive version of this plot can be found here
Takeaways from this review are: - Survey measures have been reported to have higher reliability than task measures
- Survey measures have less variability in the reported reliability estimates compared to task measures
The variables included in this report are:
- meaningful variables (includes only hddm parameters)
- EZ diffusion parameters
- Raw RT and Accuracy measures
- Variables found in the literature (for comparison)
# t1 measures: keep only the variables selected for this report
# (retest_report_vars is defined elsewhere; it must include the id column
# X for the renaming below to work) and rename X to sub_id.
test_data <- read.csv('/Users/zeynepenkavi/Documents/PoldrackLabLocal/Self_Regulation_Ontology/Data/Retest_09-27-2017/t1_data/variables_exhaustive.csv')
test_data <- test_data[,names(test_data) %in% retest_report_vars]
test_data$X <- as.character(test_data$X)
names(test_data)[which(names(test_data) == 'X')] <-'sub_id'
For reference, here are the variables that are not included in the analyses in the remainder of this report because they were not of theoretical interest in factor structure analyses of this data so far. These include drift diffusion and other model parameters for specific conditions within a task; survey variables that are not part of the dependent variables for that survey in the literature; and demographics (these are saved for prediction analyses).
# Variables present in the exhaustive t1 file but excluded from this
# report (condition-specific model parameters, non-DV survey items, and
# demographics; see the rationale in the surrounding text).
test_data2 <- read.csv('/Users/zeynepenkavi/Documents/PoldrackLabLocal/Self_Regulation_Ontology/Data/Retest_09-27-2017/t1_data/variables_exhaustive.csv')
# setdiff() replaces the original names(...)[which(... %in% ... == FALSE)]
# construction; column names are unique, so the result is identical.
df <- data.frame(vars = setdiff(names(test_data2), names(test_data)))
df
# t2 measures, processed the same way as test_data, then restricted to
# subjects who also have t1 data.
retest_data <- read.csv('/Users/zeynepenkavi/Documents/PoldrackLabLocal/Self_Regulation_Ontology/Data/Retest_09-27-2017/variables_exhaustive.csv')
retest_data <- retest_data[,names(retest_data) %in% retest_report_vars]
retest_data$X <- as.character(retest_data$X)
names(retest_data)[which(names(retest_data) == 'X')] <-'sub_id'
retest_data = retest_data[retest_data$sub_id %in% test_data$sub_id,]
Since HDDM parameters depend on the sample on which they are fit we refit the model on t1 data for the subjects that have t2 data. Here we replace the HDDM parameters in the current t1 dataset with these refitted values.
# HDDM parameters refit on t1 data restricted to the retest subsample.
# NOTE(review): this path points at Retest_03-21-2017 while every other
# input uses Retest_09-27-2017 -- confirm the older directory is intended.
hddm_refits <- read.csv('/Users/zeynepenkavi/Documents/PoldrackLabLocal/Self_Regulation_Ontology/Data/Retest_03-21-2017/hddm_refits_exhaustive.csv')
hddm_refits = hddm_refits[,names(hddm_refits) %in% retest_report_vars]
hddm_refits$X <- as.character(hddm_refits$X)
names(hddm_refits)[which(names(hddm_refits) == 'X')] <-'sub_id'
#For later comparison of whether fitting the DDM parameters on full or retest sample makes a big difference
test_data_full_sample_hddm <- test_data
# Drop the full-sample HDDM columns from test_data and merge the refit
# values back in. cbind() names the id column 'test_data$sub_id'; the
# following line renames it back to sub_id before the merge.
test_data = cbind(test_data$sub_id, test_data[,names(test_data) %in% names(hddm_refits) == FALSE])
names(test_data)[which(names(test_data) == 'test_data$sub_id')] <-'sub_id'
test_data = merge(test_data, hddm_refits, by="sub_id")
Point estimates of reliability for the demographic variables.
# Point-estimate reliabilities for the numeric demographic variables.
# Identify numeric columns directly (replaces a 1:length() loop that grew
# the vector one element at a time).
numeric_cols <- names(test_demog)[vapply(test_demog, is.numeric, logical(1))]

# One row per demographic variable, one column per reliability metric.
demog_rel_df <- data.frame(spearman = rep(NA, length(numeric_cols)),
                           icc = rep(NA, length(numeric_cols)),
                           pearson = rep(NA, length(numeric_cols)))
row.names(demog_rel_df) <- numeric_cols

# get_spearman/get_icc/get_pearson are helpers defined elsewhere; each
# takes a variable name plus the two session data frames.
for (v in numeric_cols) {
  demog_rel_df[v, 'spearman'] <- get_spearman(v, t1_df = test_demog, t2_df = retest_demog)
  demog_rel_df[v, 'icc'] <- get_icc(v, t1_df = test_demog, t2_df = retest_demog)
  demog_rel_df[v, 'pearson'] <- get_pearson(v, t1_df = test_demog, t2_df = retest_demog)
}

# Display, sorted by ICC (descending).
demog_rel_df %>%
mutate(var = row.names(.)) %>%
select(var, icc, spearman, pearson) %>%
arrange(-icc)
Based on Weir (2005) ICC(3,k) does not take in to account within subject differences between two time points (i.e. the fixed effect of time/systematic error). Thus, it is well approximated by Pearson’s r and subject to similar criticisms. Weir (2005) suggests reporting at least this systematic error effect size if one chooses to report with ICC(3,k). Based on his conclusions here I report:
- ICC(3,k): As Dave clarified this ranges from 1 to -1/(number of repeated measures -1) so in our case this range would be [-1, 1]; larger values would mean that the two scores of a subject for a given measure are more similar to each other than they are to scores of other people
- partial \(\eta^2\) for time (\(SS_{time}/SS_{within}\)): effect size of time
- SEM (\(\sqrt(MS_{error})\)): standard error of measurement; the smaller the better
#Create df of point estimate reliabilities
# Variables eligible for reliability estimation: numeric t1 columns that
# also exist in the retest data. (Replaces a 1:length() loop that grew
# the vector one element at a time.)
is_num <- vapply(test_data, is.numeric, logical(1))
numeric_cols <- names(test_data)[is_num & names(test_data) %in% names(retest_data)]

# One row per variable. Metrics follow Weir (2005): ICC(3,k), partial
# eta^2 for the time effect, SEM, plus the variance components entering
# the ICC (between subjects, between sessions, residual).
rel_df <- data.frame(spearman = rep(NA, length(numeric_cols)),
                     icc = rep(NA, length(numeric_cols)),
                     pearson = rep(NA, length(numeric_cols)),
                     eta_sq = rep(NA, length(numeric_cols)),
                     sem = rep(NA, length(numeric_cols)),
                     var_subs = rep(NA, length(numeric_cols)),
                     var_ind = rep(NA, length(numeric_cols)),
                     var_resid = rep(NA, length(numeric_cols)))
row.names(rel_df) <- numeric_cols

# get_* helpers are defined elsewhere and default to test_data/retest_data.
for (v in numeric_cols) {
  rel_df[v, 'spearman'] <- get_spearman(v)
  rel_df[v, 'icc'] <- get_icc(v)
  rel_df[v, 'pearson'] <- get_pearson(v)
  rel_df[v, 'eta_sq'] <- get_eta(v)
  rel_df[v, 'sem'] <- get_sem(v)
  # Hoisted: the original called get_var_breakdown() three times per
  # variable; one call supplies all three components.
  var_breakdown <- get_var_breakdown(v)
  rel_df[v, 'var_subs'] <- var_breakdown$subs
  rel_df[v, 'var_ind'] <- var_breakdown$ind
  rel_df[v, 'var_resid'] <- var_breakdown$resid
}

# Move variable names into a dv column and label each as task or survey.
rel_df$dv = row.names(rel_df)
row.names(rel_df) = seq_len(nrow(rel_df))
rel_df$task = 'task'
rel_df[grep('survey', rel_df$dv), 'task'] = 'survey'
# holt_laury has "survey" in its dv names but is treated as a task
# (consistent with the manual correction applied to measure_labels later).
rel_df[grep('holt', rel_df$dv), 'task'] = "task"
rel_df = rel_df %>%
select(dv, task, spearman, icc, pearson, eta_sq, sem, var_subs, var_ind, var_resid)
# row.names(rel_df) = NULL
Though we are primarily reporting ICC’s as our metric of reliability the results don’t change depending on the metric chosen. Here we plot point estimates of three different reliability metrics against each other (ICC, Pearson, Spearman). The bootstrapped version is essentially the same but the plots are busier due to more datapoints.
# Scatterplots of each pair of reliability metrics (Spearman vs ICC,
# Pearson vs ICC, Pearson vs Spearman) with the identity line; points
# colored by measure type.
p1 = rel_df %>%
ggplot(aes(spearman, icc, col=task))+
geom_point()+
theme_bw()+
theme(legend.title = element_blank(),
legend.position = "none")+
geom_abline(intercept = 0, slope=1)
p2 = rel_df %>%
ggplot(aes(pearson, icc, col=task))+
geom_point()+
theme_bw()+
theme(legend.title = element_blank(),
legend.position = "none")+
geom_abline(intercept = 0, slope=1)
p3 = rel_df %>%
ggplot(aes(pearson, spearman, col=task))+
geom_point()+
theme_bw()+
theme(legend.title = element_blank(),
legend.position = "none")+
geom_abline(intercept = 0, slope=1)
grid.arrange(p1, p2, p3, nrow=1)
# NOTE(review): passing grid.arrange(...) to ggsave renders the grid a
# second time; arrangeGrob() would avoid the duplicate draw.
ggsave('Metric_Scatterplots.jpg', plot = grid.arrange(p1, p2, p3, nrow=1), device = "jpeg", path = "/Users/zeynepenkavi/Dropbox/PoldrackLab/SRO_Retest_Analyses/output/figures/", width = 12, height = 4, units = "in", limitsize = FALSE)
Note: Some variables have <0 ICC’s. This would be the case if the \(MS_{error}\)>\(MS_{between}\). Data for these variables have no relationship between the two time points.
Summarized bootstrapped reliabilities
# Bootstrapped reliability estimates, merged across bootstrap iterations
# (file produced elsewhere).
boot_df <- read.csv('/Users/zeynepenkavi/Documents/PoldrackLabLocal/Self_Regulation_Ontology/Data/Retest_09-27-2017/bootstrap_merged.csv')
# Metric columns were not read as numeric; drop incomplete rows and
# coerce via character to avoid factor-level-to-integer surprises.
boot_df = boot_df %>%
drop_na() %>%
mutate(icc = as.numeric(as.character(icc)),
spearman = as.numeric(as.character(spearman)),
eta_sq = as.numeric(as.character(eta_sq)),
sem = as.numeric(as.character(sem)))
# Restrict to the variables covered by this report.
boot_df = boot_df[boot_df$dv %in% retest_report_vars,]
# retest_report_vars[which(retest_report_vars %in% boot_df$dv==FALSE)]
# Per-variable median and 95% bootstrap interval, shown as a table.
boot_df %>%
group_by(dv) %>%
summarise(icc_median = quantile(icc, probs = 0.5),
icc_2.5 = quantile(icc, probs = 0.025),
icc_97.5 = quantile(icc, probs = 0.975),
spearman_median = quantile(spearman, probs = 0.5),
spearman_2.5 = quantile(spearman, probs = 0.025),
spearman_97.5 = quantile(spearman, probs = 0.975)) %>%
datatable() %>%
formatRound(columns=c('icc_median', 'icc_2.5', 'icc_97.5', 'spearman_median', 'spearman_2.5', 'spearman_97.5'), digits=3)
# Df wrangling for plotting
# Attach the bootstrapped icc/spearman draws to the measure-label table
# (measure_labels is defined elsewhere).
tmp = measure_labels %>%
mutate(dv = as.character(dv)) %>%
left_join(boot_df[,c("dv", "icc", "spearman")], by = 'dv')
# Split dv into task_group and variable, order task groups by task type,
# and normalize labels for display (same scheme as the lit-review plot).
tmp = tmp %>%
separate(dv, c("task_group", "var"), sep="\\.",remove=FALSE,extra="merge") %>%
mutate(task_group = factor(task_group, levels = task_group[order(task)])) %>%
separate(var, c("var"), sep="\\.",remove=TRUE,extra="drop") %>%
mutate(task_group = gsub("_", " ", task_group),
var = gsub("_", " ", var)) %>%
arrange(task_group, var)
# Add the point-estimate ICC alongside the bootstrapped icc draws:
# icc.x (bootstrap) -> icc, icc.y (point estimate) -> point_est.
tmp = tmp %>%
left_join(rel_df[,c("dv", "icc")], by = "dv") %>%
rename(icc = icc.x, point_est = icc.y)
#Manual correction
# holt laury is labeled a survey by name but treated as a task; also
# shorten two long task names and strip the "survey" suffix.
tmp = tmp %>%
mutate(task = ifelse(task_group == 'holt laury survey', "task", as.character(task))) %>%
mutate(task_group = ifelse(task_group == "psychological refractory period two choices", "psychological refractory period", ifelse(task_group == "angling risk task always sunny", "angling risk task",task_group))) %>%
mutate(task_group = gsub("survey", "", task_group))
Variable level summary of bootstrapped reliabilities.
# Variable-level plot of bootstrapped ICC draws (colored) with the point
# estimate overlaid in black, faceted by task; raw measures only.
p4 <- tmp %>%
filter(task == 'task',
raw_fit == 'raw') %>%
ggplot(aes(y = var, x = icc)) +
geom_point(color = '#00BFC4')+
geom_point(aes(y = var, x = point_est), color = "black")+
facet_grid(task_group~., switch = "y", scales = "free_y", space = "free_y", labeller = label_wrap_gen(width=20)) +
theme(panel.spacing = unit(0.75, "lines"),
strip.placement = "outside",
strip.text.y = element_text(angle=180, size=36),
axis.text.y = element_text(size=20),
panel.background = element_rect(fill = NA),
panel.grid.major = element_line(colour = "grey80")) +
xlab("")+
ylab("")+
scale_x_continuous(limits = c(-0.25,1), breaks=c(-0.25, 0, 0.25, 0.5, 0.75, 1))+
scale_y_discrete(labels = function(x) str_wrap(x, width = 10))+
geom_vline(xintercept = 0, color = "red", size = 1)
# Survey counterpart; note no raw_fit filter here (surveys have no model
# fits to exclude -- confirm).
p5 <- tmp %>%
filter(task == 'survey') %>%
ggplot(aes(y = var, x = icc)) +
geom_point(color = '#F8766D')+
geom_point(aes(y = var, x = point_est), color = "black")+
facet_grid(task_group~., switch = "y", scales = "free_y", space = "free_y", labeller = label_wrap_gen(width=20)) +
theme(panel.spacing = unit(0.75, "lines"),
strip.placement = "outside",
strip.text.y = element_text(angle=180, size=36),
axis.text.y = element_text(size=20),
panel.background = element_rect(fill = NA),
panel.grid.major = element_line(colour = "grey80")) +
xlab("")+
ylab("")+
scale_x_continuous(limits = c(-0.25,1), breaks=c(-0.25, 0, 0.25, 0.5, 0.75, 1))+
scale_y_discrete(labels = function(x) str_wrap(x, width = 10))+
geom_vline(xintercept = 0, color = "red", size = 1)
# Combine side by side, save at poster size, free intermediates.
p6 <- arrangeGrob(p4, p5,nrow=1)
ggsave('Bootstrap_Raw_Var_Plot.jpg', plot = p6, device = "jpeg", path = "/Users/zeynepenkavi/Dropbox/PoldrackLab/SRO_Retest_Analyses/output/figures/", width = 36, height = 72, units = "in", limitsize = FALSE, dpi=50)
rm(p4, p5, p6)
# Task-level summary: violin plots of bootstrapped ICC distributions per
# task group (flipped so task names run down the y axis).
p4_t <- tmp %>%
filter(task == 'task') %>%
ggplot(aes(x = factor(task_group, levels=rev(unique(task_group))), y = icc)) +
geom_violin(fill='#00BFC4')+
theme_bw() +
theme(axis.text.y = element_text(size=30))+
xlab("")+
ylab("")+
scale_y_continuous(limits = c(-0.25,1), breaks=c(-0.25, 0, 0.25, 0.5, 0.75, 1))+
scale_x_discrete(labels = function(x) str_wrap(x, width = 15))+
geom_hline(yintercept = 0, color = "red", size = 1)+
coord_flip()
# Survey counterpart.
p5_t <- tmp %>%
filter(task == 'survey') %>%
ggplot(aes(x = factor(task_group, levels=rev(unique(task_group))), y = icc)) +
geom_violin(fill='#F8766D')+
theme_bw() +
theme(axis.text.y = element_text(size=43))+
xlab("")+
ylab("")+
scale_y_continuous(limits = c(-0.25,1), breaks=c(-0.25, 0, 0.25, 0.5, 0.75, 1))+
scale_x_discrete(labels = function(x) str_wrap(x, width = 10))+
geom_hline(yintercept = 0, color = "red", size = 1)+
coord_flip()
p6_t <- arrangeGrob(p4_t, p5_t,nrow=1)
# NOTE(review): this is the only ggsave using a relative output path
# ("../output/figures/") instead of the absolute path used everywhere
# else -- confirm the working directory makes these equivalent.
ggsave('Bootstrap_Poster_Plot_t.jpg', plot = p6_t, device = "jpeg", path = "../output/figures/", width = 27, height = 48, units = "in", limitsize = FALSE)
Comparison of survey measures to cognitive task measures in the bootstrapped results. Multilevel model with random intercepts for each measure and fixed effect of survey versus cognitive measure.
# Label each bootstrapped dv as survey or task; "holt" measures contain
# "survey" in their names but are treated as tasks (same correction as in
# the point-estimate labeling).
boot_df = boot_df %>%
mutate(task = ifelse(grepl("survey",dv), "survey","task"),
task = ifelse(grepl("holt",dv), "task", task))
# Median and 95% interval by measure type. num_vars divides by 1000,
# presumably the number of bootstrap iterations per variable -- confirm.
boot_df %>%
group_by(task) %>%
summarise(icc_median = quantile(icc, probs = 0.5),
icc_2.5 = quantile(icc, probs = 0.025),
icc_97.5 = quantile(icc, probs = 0.975),
spearman_median = quantile(spearman, probs = 0.5),
spearman_2.5 = quantile(spearman, probs = 0.025),
spearman_97.5 = quantile(spearman, probs = 0.975),
num_vars = n()/1000) %>%
datatable() %>%
formatRound(columns=c('icc_median', 'icc_2.5', 'icc_97.5', 'spearman_median', 'spearman_2.5', 'spearman_97.5'), digits=3)
# Mixed model: fixed effect of measure type, random intercept per dv.
summary(lmerTest::lmer(icc ~ task + (1|dv), boot_df))
Linear mixed model fit by REML t-tests use Satterthwaite approximations
to degrees of freedom [lmerMod]
Formula: icc ~ task + (1 | dv)
Data: boot_df
REML criterion at convergence: -917682
Scaled residuals:
Min 1Q Median 3Q Max
-11.241 -0.402 0.024 0.461 7.246
Random effects:
Groups Name Variance Std.Dev.
dv (Intercept) 0.05214 0.2283
Residual 0.00683 0.0827
Number of obs: 429000, groups: dv, 429
Fixed effects:
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 0.8722 0.0265 428.0000 32.9 <2e-16 ***
tasktask -0.3859 0.0292 428.0000 -13.2 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Correlation of Fixed Effects:
(Intr)
tasktask -0.910
# Overlaid histograms of bootstrapped ICCs for surveys vs tasks; ggsave
# with no plot argument saves the last plot displayed.
boot_df %>%
ggplot(aes(icc, fill = task))+
geom_histogram(alpha=0.5, position='identity')+
theme_bw()+
theme(legend.title = element_blank())+
xlab("ICC")
ggsave('Hist_Boot_Rel.jpg', device = "jpeg", path = "/Users/zeynepenkavi/Dropbox/PoldrackLab/SRO_Retest_Analyses/output/figures/", width = 6, height = 4, units = "in", limitsize = FALSE, dpi = 72)
The quantitative explanation for the difference in reliability estimates between surveys and tasks, as recently detailed by Hedge et al. (2017), lies in the difference in sources of variance between these measures. This is because reliability is estimated as a ratio of these sources of variance to each other. Specifically, the ICC is calculated as the ratio of between-subjects variance to all sources of variance. Thus, measures with high between-subjects variance would have high test-retest reliability. Intuitively, measures with high between-subjects variance are also better suited for individual difference analyses as they would capture the differences between the subjects in a sample.
Here we first plot the percentage of variance explained by the three sources of variance for the point estimates of measure reliabilities. The plot only includes raw measures (no DDM parameters) and the measures are ranked by percentage of between subject variability for each task/survey (i.e. the best to worst individual difference measure for each task/survey). Then we compare statistically whether the percentage of variance explained by these sources differ between tasks and surveys.
# Stacked-bar breakdown of variance components for TASK measures (raw
# only, no EZ/hddm parameters). Each measure's three components are
# expressed as percentages; measures are ranked within task by
# between-subject variance share.
tmp = rel_df %>%
mutate(var_subs_pct = var_subs/(var_subs+var_ind+var_resid)*100,
var_ind_pct = var_ind/(var_subs+var_ind+var_resid)*100,
var_resid_pct = var_resid/(var_subs+var_ind+var_resid)*100) %>%
select(dv, task, var_subs_pct, var_ind_pct, var_resid_pct) %>%
mutate(dv = factor(dv, levels = dv[order(task)])) %>%
separate(dv, c("task_group", "var"), sep="\\.",remove=FALSE,extra="merge") %>%
mutate(task_group = factor(task_group, levels = task_group[order(task)])) %>%
# rank is a global row number assigned BEFORE the task filter below; it
# is used to key y-axis labels in the plot.
arrange(task_group, var_subs_pct) %>%
mutate(rank = row_number()) %>%
arrange(task, task_group, rank) %>%
gather(key, value, -dv, -task_group, -var, -task, -rank) %>%
ungroup()%>%
mutate(task_group = gsub("_", " ", task_group),
var = gsub("_", " ", var)) %>%
mutate(task_group = ifelse(task_group == "psychological refractory period two choices", "psychological refractory period", ifelse(task_group == "angling risk task always sunny", "angling risk task",task_group))) %>%
mutate(task_group = gsub("survey", "", task_group)) %>%
filter(task=="task",
!grepl("EZ|hddm", dv))%>%
arrange(task_group, rank)
# One row per measure, used to map rank values to variable names on the
# y axis.
labels = tmp %>%
distinct(dv, .keep_all=T)
p1 <- tmp %>%
ggplot(aes(x=factor(rank), y=value, fill=factor(key, levels = c("var_resid_pct", "var_ind_pct", "var_subs_pct"))))+
geom_bar(stat='identity', alpha = 0.75, color='#00BFC4')+
scale_x_discrete(breaks = labels$rank,
labels = labels$var)+
coord_flip()+
facet_grid(task_group~., switch = "y", scales = "free_y", space = "free_y") +
theme(panel.spacing = unit(0.5, "lines"),
strip.placement = "outside",
strip.text.y = element_text(angle=180),
panel.background = element_rect(fill = NA),
panel.grid.major = element_line(colour = "grey85"),
legend.position = 'bottom')+
theme(legend.title = element_blank())+
scale_fill_manual(breaks = c("var_subs_pct", "var_ind_pct", "var_resid_pct"),
labels = c("Variance between individuals",
"Variance between sessions",
"Error variance"),
values=c("grey65", "grey45", "grey25"))+
ylab("")+
xlab("")
# Survey counterpart of the variance-breakdown plot. NOTE(review): this
# block duplicates the task pipeline above except for the final filter
# and colors; a shared helper function would remove the duplication.
tmp = rel_df %>%
mutate(var_subs_pct = var_subs/(var_subs+var_ind+var_resid)*100,
var_ind_pct = var_ind/(var_subs+var_ind+var_resid)*100,
var_resid_pct = var_resid/(var_subs+var_ind+var_resid)*100) %>%
select(dv, task, var_subs_pct, var_ind_pct, var_resid_pct) %>%
mutate(dv = factor(dv, levels = dv[order(task)])) %>%
separate(dv, c("task_group", "var"), sep="\\.",remove=FALSE,extra="merge") %>%
mutate(task_group = factor(task_group, levels = task_group[order(task)])) %>%
arrange(task_group, var_subs_pct) %>%
mutate(rank = row_number()) %>%
arrange(task, task_group, rank) %>%
gather(key, value, -dv, -task_group, -var, -task, -rank) %>%
ungroup()%>%
mutate(task_group = gsub("_", " ", task_group),
var = gsub("_", " ", var)) %>%
mutate(task_group = ifelse(task_group == "psychological refractory period two choices", "psychological refractory period", ifelse(task_group == "angling risk task always sunny", "angling risk task",task_group))) %>%
mutate(task_group = gsub("survey", "", task_group)) %>%
filter(task=="survey")%>%
arrange(task_group, rank)
labels = tmp %>%
distinct(dv, .keep_all=T)
p2 <- tmp %>%
ggplot(aes(x=factor(rank), y=value, fill=factor(key, levels = c("var_resid_pct", "var_ind_pct", "var_subs_pct"))))+
# Two geom_bar layers: a translucent fill plus an outlined overlay
# (unlike the task plot, which draws a single outlined layer).
geom_bar(stat='identity', alpha = 0.75)+
geom_bar(stat='identity', color='#F8766D', show.legend=FALSE)+
scale_x_discrete(breaks = labels$rank,
labels = labels$var)+
coord_flip()+
facet_grid(task_group~., switch = "y", scales = "free_y", space = "free_y") +
theme(panel.spacing = unit(0.5, "lines"),
strip.placement = "outside",
strip.text.y = element_text(angle=180),
panel.background = element_rect(fill = NA),
panel.grid.major = element_line(colour = "grey85"),
legend.position = 'bottom')+
theme(legend.title = element_blank())+
scale_fill_manual(breaks = c("var_subs_pct", "var_ind_pct", "var_resid_pct"),
labels = c("Variance between individuals",
"Variance between sessions",
"Error variance"),
values=c("grey65", "grey45", "grey25"))+
ylab("")+
xlab("")
# Share one legend between the two panels, combine, save, and clean up.
mylegend<-g_legend(p2)
p3 <- arrangeGrob(arrangeGrob(p1 +theme(legend.position="none"),
p2 + theme(legend.position="none"),
nrow=1),
mylegend, nrow=2,heights=c(10, 1))
ggsave('Variance_Breakdown_Plot.jpg', plot = p3, device = "jpeg", path = "/Users/zeynepenkavi/Dropbox/PoldrackLab/SRO_Retest_Analyses/output/figures", width = 24, height = 20, units = "in")
rm(tmp, labels)
Comparing types of variance for survey vs task measures: Survey measures have higher between subject variability
Note: This analysis includes DDM variables too.
# Variance-component percentages from the bootstrapped estimates in long
# format; the mixed model tests whether the component mix (key) differs
# between tasks and surveys, with a random intercept per measure.
tmp = boot_df %>%
mutate(var_subs_pct = var_subs/(var_subs+var_ind+var_resid)*100,
var_ind_pct = var_ind/(var_subs+var_ind+var_resid)*100,
var_resid_pct = var_resid/(var_subs+var_ind+var_resid)*100) %>%
select(dv, task, var_subs_pct, var_ind_pct, var_resid_pct) %>%
gather(key, value, -dv, -task)
# The convergence warning reproduced in the output below stems from the
# unscaled percentage outcome.
summary(lmer(value~key*task+(1|dv),tmp))
Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
- Rescale variables?
Linear mixed model fit by REML ['lmerMod']
Formula: value ~ key * task + (1 | dv)
Data: tmp
REML criterion at convergence: 10765111
Scaled residuals:
Min 1Q Median 3Q Max
-3.422 -0.668 -0.085 0.608 4.224
Random effects:
Groups Name Variance Std.Dev.
dv (Intercept) 0 0.0
Residual 251 15.9
Number of obs: 1287000, groups: dv, 429
Fixed effects:
Estimate Std. Error t value
(Intercept) 9.84e+00 5.83e-02 169
keyvar_resid_pct -7.05e-05 8.24e-02 0
keyvar_subs_pct 7.05e+01 8.24e-02 855
tasktask 1.34e+01 6.41e-02 210
keyvar_resid_pct:tasktask 1.26e+00 9.06e-02 14
keyvar_subs_pct:tasktask -4.15e+01 9.06e-02 -458
Correlation of Fixed Effects:
(Intr) kyvr_r_ kyvr_s_ tsktsk kyvr_r_:
kyvr_rsd_pc -0.707
kyvr_sbs_pc -0.707 0.500
tasktask -0.910 0.643 0.643
kyvr_rsd_p: 0.643 -0.910 -0.455 -0.707
kyvr_sbs_p: 0.643 -0.455 -0.910 -0.707 0.500
convergence code: 0
Model is nearly unidentifiable: very large eigenvalue
- Rescale variables?
# Mean per component/type (base R), then median and SD (dplyr), for the
# same long-format variance data.
aggregate(value ~task+key, FUN=mean, data=tmp)
tmp%>%
group_by(task, key) %>%
summarise(median = median(value),
sd = sd(value))
Summarizing for clearer presentation. This graph currently uses the bootstrapped reliabilities and is therefore messier than if it used just the point estimates.
# Boxplots of the variance-component percentages by measure type.
# (The original piped through group_by(task, key) first; dplyr groups
# have no effect on the ggplot that follows, so the dead grouping step
# is removed.)
tmp %>%
ggplot(aes(factor(task, levels=c("task","survey"), labels=c("Task", "Survey")), value, fill=key, color=task))+
geom_boxplot()+
theme_bw()+
theme(legend.title = element_blank())+
scale_fill_manual(breaks = c("var_subs_pct", "var_ind_pct", "var_resid_pct"),
labels = c("Variance between individuals",
"Variance between sessions",
"Error variance"),
values=c("grey65", "grey45", "grey25"))+
scale_color_discrete(guide = FALSE)+
xlab("")+
ylab("Percent")
# ggsave with no plot argument saves the last plot displayed.
ggsave('Variance_Breakdown_Plot_Summary.jpg', device = "jpeg", path = "/Users/zeynepenkavi/Dropbox/PoldrackLab/SRO_Retest_Analyses/output/figures", width = 10, height = 5, units = "in")
Here we summarize the results on a task level to make them more digestible and easier to relate to the literature.
We reduce the list of task measures to a list of one per task by averaging only the raw measures from all the trials in a task. We chose to reduce the information in this manner to avoid any bias stemming from differential amount of interest and procedures applied to certain tasks over others (e.g. a task can have over 10 measures because it has multiple conditions and we have chosen to fit DDM’s for specific conditions while another might only have 2 due to our relative inexperience and lack of interest in it). We check whether the number of trials in a task has a significant effect on these average reliabilities of raw measures as well.
We filter out the DDM parameters and measures for specific contrasts. Note that this does leave some tasks with measures that are model fits and/or for specific conditions (because at least the current datasets do not include measures that are based on all the trials and are raw, though I could dive into variables_exhaustive for such measures. For example, the average reliability for Kirby is based on three discount rates for specific conditions.). Here’s the order of tasks by mean reliability, sorted for ICC and then Spearman’s \(\rho\).
# Task-level reliability summary: keep raw (non-EZ, non-hddm,
# non-difference) task measures, attach their bootstrapped estimates,
# and summarize per task.
tmp = measure_labels %>%
mutate(dv = as.character(dv)) %>%
filter(task == 'task') %>%
left_join(boot_df[,c("dv", "icc", "spearman")], by = 'dv') %>%
filter(overall_difference != 'difference' & raw_fit %in% c('EZ', 'hddm') == FALSE)%>%
separate(dv, c('task_name', 'extra_1', 'extra_2'), sep = '\\.',remove=FALSE) %>%
select(-extra_1, -extra_2) %>%
group_by(task_name) %>%
summarise(median_icc = median(icc),
median_spearman = median(spearman),
min_icc = min(icc),
max_icc = max(icc),
min_spearman = min(spearman),
max_spearman = max(spearman),
num_measures = n(),
# NOTE(review): unique() inside summarise assumes one trial count
# per task; this errors if a task has several distinct counts.
num_trials = unique(num_all_trials))%>%
arrange(-median_icc, -median_spearman)
# Display as a rounded table.
tmp %>%
datatable() %>%
formatRound(columns=c('median_spearman', 'median_icc',
'min_spearman', 'min_icc',
'max_spearman', 'max_icc'), digits=3)
Does number of items in a task have a significant effect on the average ICC of (mostly) raw measures for all trials from a task? No. (no effect on Spearman either)
# Long data (one bootstrap draw per row) for testing whether the number of
# trials in a task predicts the ICC of its raw, non-difference measures.
tmp = measure_labels %>%
  mutate(dv = as.character(dv)) %>%
  filter(task == 'task') %>%
  # left_join(rel_df[,c("dv", "spearman","icc")], by='dv') %>%
  left_join(boot_df[,c("dv", "spearman","icc")], by='dv') %>%
  # idiomatic negation instead of `%in% ... == FALSE`
  filter(overall_difference != 'difference',
         !(raw_fit %in% c('EZ', 'hddm'))) %>%
  separate(dv, c('task_name', 'extra_1', 'extra_2'), sep = '\\.', remove = FALSE) %>%
  select(-extra_1, -extra_2)
# summary(lm(icc ~ num_all_trials, data = tmp))
# Fixed effect of trial count with a random intercept per measure
summary(lmer(icc ~ num_all_trials + (1|dv), data = tmp))
Linear mixed model fit by REML ['lmerMod']
Formula: icc ~ num_all_trials + (1 | dv)
Data: tmp
REML criterion at convergence: -380286
Scaled residuals:
Min 1Q Median 3Q Max
-11.439 -0.462 0.037 0.523 7.564
Random effects:
Groups Name Variance Std.Dev.
dv (Intercept) 0.04299 0.2073
Residual 0.00555 0.0745
Number of obs: 162000, groups: dv, 162
Fixed effects:
Estimate Std. Error t value
(Intercept) 0.5922642 0.0231072 25.63
num_all_trials -0.0000518 0.0001206 -0.43
Correlation of Fixed Effects:
(Intr)
num_ll_trls -0.709
# Scatter of bootstrapped ICC against number of trials with a linear fit;
# same filtering as the mixed model above (raw, non-difference task measures).
measure_labels %>%
  mutate(dv = as.character(dv)) %>%
  filter(task == 'task') %>%
  # left_join(rel_df[,c("dv", "spearman","icc")], by='dv') %>%
  left_join(boot_df[,c("dv", "spearman","icc")], by='dv') %>%
  # idiomatic negation instead of `%in% ... == FALSE`
  filter(overall_difference != 'difference',
         !(raw_fit %in% c('EZ', 'hddm'))) %>%
  separate(dv, c('task_name', 'extra_1', 'extra_2'), sep = '\\.', remove = FALSE) %>%
  select(-extra_1, -extra_2) %>%
  ggplot(aes(num_all_trials, icc)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_bw() +
  xlab("Number of trials") +
  ylab("ICC")
The above analysis was looking at the effect of number of trials across tasks. But some tasks might be bad for individual difference measurement regardless of how many trials there are in them whereas for others fewer trials might be yielding a sufficiently reliable measure.
For tasks for which dependent variables are estimated using many trials one can ask: Does the same measure get less reliable if fewer trials are used to estimate its reliability?
This won’t make sense for all tasks. For example to estimate a risk aversion parameter you need all trials for Holt and Laury. For Kirby and Bickel you have specific conditions looking at fewer trials. The Cognitive Reflection Task might be more appropriate to analyze each item separately. The writing task does not have trial numbers. For all others it might be interesting to investigate.
That said this kind of analysis seems too in the weeds for a paper that is trying to give a global sense of the differences between self-regulation measures in their suitability for individual difference analyses based on their stability across time. Such analyses would provide a detailed examination of how to extract the most reliable/best individual difference measure from tasks with a set of mediocre variables to begin with. So I am choosing to ignore this for now but this can be revisited.
Checking DDM results in the bootstrapped estimates. Variables using all trials are significantly more reliable compared to difference scores. Raw measures don’t differ from DDM parameters. Which DDM is better depends on whether all trials are used.
# Join DDM-task measure labels to the bootstrapped reliability draws and
# summarise the ICC / Spearman distributions per
# (overall_difference x raw_fit x rt_acc) cell.
tmp = measure_labels %>%
mutate(dv = as.character(dv)) %>%
# keep DDM tasks only; drop condition-level difference scores
filter(ddm_task == 1,
overall_difference != "condition") %>%
drop_na() %>%
left_join(boot_df[,c("dv", "icc", "spearman")], by = 'dv')
# Median and 95% bootstrap interval per cell.
# NOTE(review): num_vars = n()/1000 assumes exactly 1000 bootstrap draws per
# dv (consistent with 219000 obs / 219 dvs in the model output below) --
# confirm against the bootstrap setup.
tmp %>%
group_by(overall_difference, raw_fit, rt_acc) %>%
summarise(icc_median = quantile(icc, probs = 0.5),
icc_2.5 = quantile(icc, probs = 0.025),
icc_97.5 = quantile(icc, probs = 0.975),
spearman_median = quantile(spearman, probs = 0.5),
spearman_2.5 = quantile(spearman, probs = 0.025),
spearman_97.5 = quantile(spearman, probs = 0.975),
num_vars = n()/1000) %>%
datatable() %>%
formatRound(columns=c('icc_median', 'icc_2.5', 'icc_97.5', 'spearman_median', 'spearman_2.5', 'spearman_97.5'), digits=3)
# Mixed model: difference-score and fit-type effects on ICC, random intercept
# per measure (lmerTest adds Satterthwaite-approximated p-values)
summary(lmerTest::lmer(icc ~ overall_difference*raw_fit + (1|dv) ,tmp))
Linear mixed model fit by REML t-tests use Satterthwaite approximations
to degrees of freedom [lmerMod]
Formula: icc ~ overall_difference * raw_fit + (1 | dv)
Data: tmp
REML criterion at convergence: -401006
Scaled residuals:
Min 1Q Median 3Q Max
-9.633 -0.465 0.039 0.513 5.470
Random effects:
Groups Name Variance Std.Dev.
dv (Intercept) 0.0326 0.1806
Residual 0.0093 0.0965
Number of obs: 219000, groups: dv, 219
Fixed effects:
Estimate Std. Error df t value
(Intercept) 0.1986 0.0261 213.8000 7.62
overall_differenceoverall 0.4507 0.0382 213.8000 11.81
raw_fithddm 0.1268 0.0521 213.8000 2.43
raw_fitraw 0.0761 0.0389 213.8000 1.96
overall_differenceoverall:raw_fithddm -0.2198 0.0654 213.8000 -3.36
overall_differenceoverall:raw_fitraw -0.0469 0.0575 213.8000 -0.82
Pr(>|t|)
(Intercept) 8.2e-13 ***
overall_differenceoverall < 2e-16 ***
raw_fithddm 0.01586 *
raw_fitraw 0.05181 .
overall_differenceoverall:raw_fithddm 0.00091 ***
overall_differenceoverall:raw_fitraw 0.41556
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Correlation of Fixed Effects:
(Intr) ovrll_ rw_fth rw_ftr
ovrll_dffrn -0.683
raw_fithddm -0.500 0.342
raw_fitraw -0.670 0.457 0.335
ovrll_dffrncvrll:rw_fth 0.399 -0.584 -0.798 -0.267
ovrll_dffrncvrll:rw_ftr 0.453 -0.663 -0.226 -0.677
ovrll_dffrncvrll:rw_fth
ovrll_dffrn
raw_fithddm
raw_fitraw
ovrll_dffrncvrll:rw_fth
ovrll_dffrncvrll:rw_ftr 0.387
# Boxplots of bootstrapped ICCs by estimation approach (raw / EZ / HDDM) and
# measure type, faceted into overall vs difference scores. Factor levels and
# display labels are named up front for readability.
fit_levels = c("raw", "EZ", "hddm")
fit_labels = c("Raw", "EZ-diffusion", "Hierarchical diffusion")
type_levels = c("rt", "accuracy", "other", "drift rate", "threshold", "non-decision")
type_labels = c("Response Time", "Accuracy", "Other", "Drift Rate", "Threshold", "Non-decision")
tmp %>%
  ggplot(aes(factor(raw_fit, levels = fit_levels, labels = fit_labels),
             icc,
             fill = factor(rt_acc, levels = type_levels, labels = type_labels))) +
  geom_boxplot() +
  facet_wrap(~factor(overall_difference, levels = c("overall", "difference"),
                     labels = c("Overall", "Difference"))) +
  theme_bw() +
  ylab("ICC") +
  xlab("") +
  theme(legend.title = element_blank())
# Write the figure out at publication size
ggsave('Bootstrap_DDM_Comp.jpg', device = "jpeg", path = "/Users/zeynepenkavi/Dropbox/PoldrackLab/SRO_Retest_Analyses/output/figures/", width = 24, height = 6, units = "in", limitsize = FALSE)
What does the variance breakdown look like for DDM variables separately?
# Variance decomposition for DDM variables: convert the raw variance
# components to percentages of total variance, order measures within task by
# between-subject variance, and draw a stacked horizontal bar per measure.
tmp = rel_df %>%
mutate(var_subs_pct = var_subs/(var_subs+var_ind+var_resid)*100,
var_ind_pct = var_ind/(var_subs+var_ind+var_resid)*100,
var_resid_pct = var_resid/(var_subs+var_ind+var_resid)*100) %>%
select(dv, task, var_subs_pct, var_ind_pct, var_resid_pct) %>%
mutate(dv = factor(dv, levels = dv[order(task)])) %>%
# task_group = dv prefix before the first '.'; rest of the name goes to var
separate(dv, c("task_group", "var"), sep="\\.",remove=FALSE,extra="merge") %>%
mutate(task_group = factor(task_group, levels = task_group[order(task)])) %>%
# rank orders measures within task by between-subject variance; the rank is
# later used as the (flipped) x axis so bars are sorted within each facet
arrange(task_group, var_subs_pct) %>%
mutate(rank = row_number()) %>%
arrange(task, task_group, rank) %>%
gather(key, value, -dv, -task_group, -var, -task, -rank) %>%
ungroup()%>%
# cosmetic renaming of task labels for display
mutate(task_group = gsub("_", " ", task_group),
var = gsub("_", " ", var)) %>%
mutate(task_group = ifelse(task_group == "psychological refractory period two choices", "psychological refractory period", ifelse(task_group == "angling risk task always sunny", "angling risk task",task_group))) %>%
mutate(task_group = gsub("survey", "", task_group)) %>%
# keep only DDM-derived task measures
filter(task=="task",
grepl("EZ|hddm", dv))%>%
arrange(task_group, rank)
# one row per measure, used to map ranks back to variable names on the axis
labels = tmp %>%
distinct(dv, .keep_all=T)
# Stacked percentage bars, one facet strip per task, flipped horizontal
p1 <- tmp %>%
ggplot(aes(x=factor(rank), y=value, fill=factor(key, levels = c("var_resid_pct", "var_ind_pct", "var_subs_pct"))))+
geom_bar(stat='identity', alpha = 0.75, color='#00BFC4')+
scale_x_discrete(breaks = labels$rank,
labels = labels$var)+
coord_flip()+
facet_grid(task_group~., switch = "y", scales = "free_y", space = "free_y") +
theme(panel.spacing = unit(0.5, "lines"),
strip.placement = "outside",
strip.text.y = element_text(angle=180),
panel.background = element_rect(fill = NA),
panel.grid.major = element_line(colour = "grey85"),
legend.position = 'bottom')+
theme(legend.title = element_blank())+
# var_subs = between-individual, var_ind = between-session, var_resid = error
scale_fill_manual(breaks = c("var_subs_pct", "var_ind_pct", "var_resid_pct"),
labels = c("Variance between individuals",
"Variance between sessions",
"Error variance"),
values=c("grey65", "grey45", "grey25"))+
ylab("")+
xlab("")
ggsave('Variance_Breakdown_Plot_DDM.jpg', plot = p1, device = "jpeg", path = "/Users/zeynepenkavi/Dropbox/PoldrackLab/SRO_Retest_Analyses/output/figures", width = 12, height = 20, units = "in")
Summarizing it by raw vs fit and for each parameter
# Variance-component percentages per DDM measure joined to the bootstrap
# draws, then summarized as boxplots by fit type and measure type.
tmp = measure_labels %>%
  mutate(dv = as.character(dv)) %>%
  filter(ddm_task == 1,
         overall_difference != "condition") %>%
  drop_na() %>%
  left_join(boot_df[,c("dv", "icc", "var_subs", "var_ind", "var_resid")], by = 'dv') %>%
  # components expressed as percent of total variance
  mutate(var_subs_pct = (var_subs/(var_subs+var_ind+var_resid))*100,
         var_ind_pct= (var_ind/(var_subs+var_ind+var_resid))*100,
         var_resid_pct = (var_resid/(var_subs+var_ind+var_resid))*100) %>%
  select(-task, -ddm_task, -num_all_trials, -var_subs, -var_ind, -var_resid) %>%
  gather(key, value, -dv, -overall_difference, -raw_fit, -rt_acc, -icc)
tmp %>%
  ggplot(aes(factor(raw_fit, levels = c("raw", "EZ", "hddm"), labels=c("Raw", "EZ-diffusion", "Hierarchical diffusion")), value, fill=factor(rt_acc, levels = c("rt","accuracy","other", "drift rate", "threshold", "non-decision"), labels=c("Response Time", "Accuracy", "Other","Drift Rate", "Threshold", "Non-decision"))))+
  geom_boxplot()+
  # FIX: var_ind_pct is between-session variance (see the variance breakdown
  # legend earlier, where var_ind_pct is labeled "Variance between sessions");
  # it was mislabeled "Variance between individuals" here.
  facet_grid(factor(key, levels = c("var_subs_pct", "var_ind_pct", "var_resid_pct"), labels=c("Variance between subjects", "Variance between sessions", "Error variance"))~factor(overall_difference, levels=c("overall", "difference"), labels=c("Overall", "Difference")))+
  theme_bw()+
  ylab("% of total variance")+
  xlab("")+
  theme(legend.title = element_blank())
ggsave('Variance_Breakdown_Plot_DDM_summary.jpg', device = "jpeg", path = "/Users/zeynepenkavi/Dropbox/PoldrackLab/SRO_Retest_Analyses/output/figures", width = 10, height = 8, units = "in")
Do these differ statistically?
The plot does not suggest so but this model is giving a warning and very high t values. How else should I be checking for differences between these distributions?
summary(lmer(value~key*overall_difference*raw_fit+(1|dv), tmp))
Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
- Rescale variables?
Linear mixed model fit by REML ['lmerMod']
Formula: value ~ key * overall_difference * raw_fit + (1 | dv)
Data: tmp
REML criterion at convergence: 5546086
Scaled residuals:
Min 1Q Median 3Q Max
-3.134 -0.708 -0.035 0.629 4.015
Random effects:
Groups Name Variance Std.Dev.
dv (Intercept) 0 0.0
Residual 271 16.5
Number of obs: 657000, groups: dv, 219
Fixed effects:
Estimate Std. Error
(Intercept) 21.6705 0.0752
keyvar_resid_pct 12.2911 0.1063
keyvar_subs_pct 22.6975 0.1063
overall_differenceoverall 5.7338 0.1101
raw_fithddm 10.3546 0.1504
raw_fitraw 2.3821 0.1123
keyvar_resid_pct:overall_differenceoverall -21.2319 0.1557
keyvar_subs_pct:overall_differenceoverall 4.0303 0.1557
keyvar_resid_pct:raw_fithddm -17.6249 0.2127
keyvar_subs_pct:raw_fithddm -13.4388 0.2127
keyvar_resid_pct:raw_fitraw -5.1278 0.1588
keyvar_subs_pct:raw_fitraw -2.0184 0.1588
overall_differenceoverall:raw_fithddm -10.5695 0.1885
overall_differenceoverall:raw_fitraw -5.1719 0.1660
keyvar_resid_pct:overall_differenceoverall:raw_fithddm 21.0070 0.2666
keyvar_subs_pct:overall_differenceoverall:raw_fithddm 10.7014 0.2666
keyvar_resid_pct:overall_differenceoverall:raw_fitraw 6.6271 0.2348
keyvar_subs_pct:overall_differenceoverall:raw_fitraw 8.8886 0.2348
t value
(Intercept) 288.2
keyvar_resid_pct 115.6
keyvar_subs_pct 213.4
overall_differenceoverall 52.1
raw_fithddm 68.9
raw_fitraw 21.2
keyvar_resid_pct:overall_differenceoverall -136.4
keyvar_subs_pct:overall_differenceoverall 25.9
keyvar_resid_pct:raw_fithddm -82.9
keyvar_subs_pct:raw_fithddm -63.2
keyvar_resid_pct:raw_fitraw -32.3
keyvar_subs_pct:raw_fitraw -12.7
overall_differenceoverall:raw_fithddm -56.1
overall_differenceoverall:raw_fitraw -31.2
keyvar_resid_pct:overall_differenceoverall:raw_fithddm 78.8
keyvar_subs_pct:overall_differenceoverall:raw_fithddm 40.1
keyvar_resid_pct:overall_differenceoverall:raw_fitraw 28.2
keyvar_subs_pct:overall_differenceoverall:raw_fitraw 37.9
Correlation matrix not shown by default, as p = 18 > 12.
Use print(x, correlation=TRUE) or
vcov(x) if you need it
convergence code: 0
Model is nearly unidentifiable: very large eigenvalue
- Rescale variables?
How much do parameter values change depending on whether the model was fit using the whole sample vs only the retest sample. Each point in these graphs is a subject’s parameter estimates. There is almost no change for any of the EZ parameters (though shouldn’t this have been 0?). For the HDDM parameters most of the changes are happening for drift rates but they don’t appear to be systematically higher or lower.
I’m not entirely sure what to make of this. A systematic change would have meant that the ranking remained the same regardless of the sample size. When the change is non-systematic in this way then a given subject that has higher drift rates than most of the people in the full sample can appear to have lower drift rates than the same people when fit on the retest sample only. (note that a t-test is not significant)
Of course these parameter estimates themselves are not very interpretable in their ‘goodness’ as they don’t reflect anything about model fits. What is the measure of fits here? The variance of the posterior?
# Compare HDDM parameter estimates fit on the full sample against re-fits on
# the retest sample only. Restrict the full-sample fits to the columns that
# were also re-fit, reshape both to long form, join per subject x dv, and
# plot one against the other with an identity line.
shared_cols = intersect(names(test_data_full_sample_hddm), names(hddm_refits))
hddm_fullfits <- test_data_full_sample_hddm[, shared_cols]
hddm_fullfits = hddm_fullfits %>%
  gather(dv, value, -sub_id)
refit_vs_full = hddm_refits %>%
  gather(dv, value, -sub_id) %>%
  left_join(hddm_fullfits, by = c("sub_id", "dv")) %>%
  rename(fullfit = value.y, refit = value.x) %>%
  left_join(measure_labels[,c("dv", "raw_fit", "rt_acc")], by="dv") %>%
  filter(rt_acc %in% c("drift rate","non-decision", "threshold"))
refit_vs_full %>%
  ggplot(aes(fullfit, refit)) +
  geom_point(aes(color = rt_acc)) +
  facet_wrap(~raw_fit) +
  theme_bw() +
  # points on the identity line agree perfectly across the two fits
  geom_abline(slope = 1, intercept = 0) +
  xlab("Fit on full sample") +
  ylab("Fit on retest sample") +
  theme(legend.title = element_blank())
There doesn’t seem to be an effect looking at the graph but the multilevel model seems to show a very significant but tiny effect.
# Sample-size sensitivity of the bootstrapped reliabilities: median ICC per
# measure as a function of the sample size N used for estimation.
tmp <- read.csv('/Users/zeynepenkavi/Dropbox/PoldrackLab/SRO_Retest_Analyses/input/all_ddm_sample_size_summary.csv')
# Interactive line plot, one line per dependent variable (legend suppressed)
ggplotly(tmp %>%
select(dv, N, icc_median) %>%
left_join(measure_labels[,c("dv", "raw_fit", "rt_acc")], by="dv") %>%
ggplot(aes(N, icc_median, color=dv))+
geom_line()+
theme_bw()+
theme(legend.position = 'none')+
xlab("Sample Size")+
ylab("Median ICC"))
# Fixed effect of N on median ICC with a random intercept per measure
summary(lmer(icc_median ~ N + (1|dv), tmp))
Linear mixed model fit by REML ['lmerMod']
Formula: icc_median ~ N + (1 | dv)
Data: tmp
REML criterion at convergence: -9249
Scaled residuals:
Min 1Q Median 3Q Max
-10.117 -0.231 0.014 0.215 6.584
Random effects:
Groups Name Variance Std.Dev.
dv (Intercept) 0.071491 0.2674
Residual 0.000539 0.0232
Number of obs: 2218, groups: dv, 148
Fixed effects:
Estimate Std. Error t value
(Intercept) 0.4448702 0.0220029 20.22
N -0.0000909 0.0000114 -7.96
Correlation of Fixed Effects:
(Intr)
N -0.042
TBD. Still need to figure out how to turn off the H.
Do people differ in how much their scores change depending on how many days it has been since they completed the initial battery?
Make data frame with difference between two scores for each measure for each subject. Since the scores for different measures are on different scales for comparability the difference scores are scaled (i.e. divided by their root mean square) but not centered so a value of 0 for the difference scores would indicate a lack of a difference between the scores of a subject.
# Build a long data frame of scaled t2 - t1 difference scores, one row per
# subject x measure. The original grew a data frame with rbind() inside the
# loop (quadratic copying) and used 1:length(); collect per-measure results
# in a preallocated list instead and bind once at the end.
diff_list = vector("list", length(numeric_cols))
for(i in seq_along(numeric_cols)){
  tmp = match_t1_t2(numeric_cols[i], format='wide')
  tmp = tmp %>%
    # mutate(difference = scale(`2` - `1`))
    # scaled (divided by root mean square) but not centered, so a value of 0
    # means no change between sessions
    mutate(difference = scale(`2` - `1`, center=F))
  diff_list[[i]] = tmp
}
t1_2_difference = do.call(rbind, diff_list)
# scale() returns a 1-column matrix; flatten it to a plain numeric vector
t1_2_difference$difference = as.data.frame(t1_2_difference$difference)$V1
Add completion dates to this data frame.
t1_2_difference = merge(t1_2_difference, comp_dates[,c('sub_id', 'days_btw')], by='sub_id')
The effect of number of days in between in the full model where the difference scores are regressed on a fixed effect for measure and days between the two scores and random intercepts for each subject (Should this model include the interaction between the fixed effects?). Since there are over 400 measures the output of the full model is not presented here. Instead below are the coefficient for the fixed effect of days between completion times and its t value.
# Difference scores regressed on measure and days between sessions, with a
# random intercept per subject; report only the days_btw fixed effect.
mod = lmer(difference ~ dv + days_btw + (1|sub_id), data = t1_2_difference)
fixed_coefs = coef(summary(mod))
data.frame(estimate = fixed_coefs["days_btw", "Estimate"],
           tval = fixed_coefs["days_btw", "t value"])
For visualization purposes I summarized the difference scores per person by looking at the average difference and plot that against the number of days between completion.
# Per-subject mean difference score plotted against that subject's
# test-retest interval, with a linear fit overlaid.
tmp = t1_2_difference %>%
  group_by(sub_id) %>%
  summarise(mean_diff = mean(difference, na.rm = TRUE),
            days_btw = unique(days_btw))
tmp %>%
  ggplot(aes(x = days_btw, y = mean_diff)) +
  geom_point() +
  theme_bw() +
  xlab("Days between completion") +
  ylab("Mean standardized difference between two time points") +
  geom_smooth(method = "lm")
To confirm: the slope of this line is not significant. That is, there doesn’t seem to be a systematic difference between the two measurements depending on the number of days between the two measurements.
summary(lm(mean_diff ~ days_btw, data=tmp))
Call:
lm(formula = mean_diff ~ days_btw, data = tmp)
Residuals:
Min 1Q Median 3Q Max
-0.30567 -0.06283 0.00471 0.06969 0.23471
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.059770 0.032602 -1.83 0.069 .
days_btw 0.000546 0.000276 1.98 0.050 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.0993 on 148 degrees of freedom
Multiple R-squared: 0.0258, Adjusted R-squared: 0.0192
F-statistic: 3.92 on 1 and 148 DF, p-value: 0.0496
For each variable does the average difference between two time points vary by the average number of days between the two time points? No. The slopes of these lines or the interaction is not significant.
# Add task_name (dv prefix before the first '.') and a survey/task flag to
# the difference-score data frame.
t1_2_difference <- t1_2_difference %>%
  separate(dv, c('task_name', 'extra_1', 'extra_2'), sep = '\\.', remove = FALSE) %>%
  select(-extra_1, -extra_2) %>%
  # holt & laury matches "survey" by name but is a task, so check it first
  mutate(task = ifelse(grepl("holt", dv), "task",
                       ifelse(grepl("survey", dv), "survey", "task")))
# summary(lmer(difference ~ days_btw + (task_name|sub_id), t1_2_difference))
# Per-measure average difference vs average days between sessions, colored
# by survey/task. The original re-derived the `task` column here even though
# it was already added to t1_2_difference in the chunk above; the redundant
# mutate is removed.
t1_2_difference %>%
  group_by(dv) %>%
  summarise(mean_diff = mean(difference),
            mean_days_btw = mean(days_btw),
            task = unique(task)) %>%
  ggplot(aes(mean_days_btw, mean_diff, color = task)) +
  geom_point() +
  theme_bw() +
  geom_smooth(method = "lm") +
  xlab("Average days between completion") +
  ylab("Average difference between time points") +
  theme(legend.title = element_blank())
# Same per-measure summary as the plot above, kept for the interaction model
tmp = t1_2_difference %>%
  group_by(dv) %>%
  summarise(mean_diff = mean(difference),
            mean_days_btw = mean(days_btw),
            task = unique(task))
# Does the days-between slope differ between surveys and tasks?
interaction_fit = lm(mean_diff ~ mean_days_btw*task, tmp)
summary(interaction_fit)
Call:
lm(formula = mean_diff ~ mean_days_btw * task, data = tmp)
Residuals:
Min 1Q Median 3Q Max
-0.3866 -0.0735 0.0029 0.0747 0.3361
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.1270 5.7888 0.37 0.71
mean_days_btw -0.0187 0.0506 -0.37 0.71
tasktask -4.0722 5.8832 -0.69 0.49
mean_days_btw:tasktask 0.0358 0.0514 0.70 0.49
Residual standard error: 0.117 on 425 degrees of freedom
Multiple R-squared: 0.0164, Adjusted R-squared: 0.00945
F-statistic: 2.36 on 3 and 425 DF, p-value: 0.0708
Is the t2 score higher for task variables? Plotting the average difference between time points (for all subjects) for each dependent variable coloring by task. More positive on the y-axis means more improvement. Didn’t want to summarize it at task level because not all measures might be things we’d expect learning effects on.
It’s hard to detect any meaningful global changes/improvements from this graph. It seems that for each task there might be some variables that have improved at t2 but whether these are meaningful learning effects should be determined based on the variable.
# Average t2 - t1 difference per task variable (positive = improvement at
# t2), colored by task; x labels suppressed since there are hundreds of dvs.
task_means = t1_2_difference %>%
  filter(task == "task") %>%
  group_by(dv) %>%
  summarise(mean_diff = mean(difference),
            task_name = unique(task_name))
task_means %>%
  ggplot(aes(x = dv, y = mean_diff, color = task_name)) +
  geom_point() +
  theme_bw() +
  theme(legend.position = "none",
        axis.text.x = element_blank()) +
  xlab("Dependent variable") +
  ylab("Average difference between time points")
All results reported above were for non-transformed dependent variables.
We could compare the reliability (point) estimates for meaningful variables that have been transformed in the full dataset and transformed the same way in the retest dataset as well.
# Stash the current datasets so they can be restored later, then load the
# "meaningful variables" (transformed) versions for both time points. The
# duplicated read/rename code is factored into a local helper, and the
# unnecessary which() in the rename is dropped.
test_data_tmp <- test_data
retest_data_tmp <- retest_data

# Read a meaningful_variables_clean.csv and rename its X column to sub_id
read_mngfl <- function(path) {
  dat <- read.csv(path)
  dat$X <- as.character(dat$X)
  names(dat)[names(dat) == 'X'] <- 'sub_id'
  dat
}

test_data <- read_mngfl('/Users/zeynepenkavi/Documents/PoldrackLabLocal/Self_Regulation_Ontology/Data/Retest_09-27-2017/t1_data/meaningful_variables_clean.csv')
retest_data <- read_mngfl('/Users/zeynepenkavi/Documents/PoldrackLabLocal/Self_Regulation_Ontology/Data/Retest_09-27-2017/meaningful_variables_clean.csv')
# Keep only retest subjects that also appear at t1
retest_data = retest_data[retest_data$sub_id %in% test_data$sub_id,]
# Columns to compare across time points: numeric at t1 and present in both
# datasets. Vectorized replacement for the original loop that grew a vector
# with c() and iterated over 1:length() (which misbehaves on empty input).
is_num <- vapply(test_data, is.numeric, logical(1))
numeric_cols_mngfl <- names(test_data)[is_num & names(test_data) %in% names(retest_data)]
# Point-estimate ICC for each shared numeric measure, indexed by row name.
# Iterate over the names directly (the original 1:length() form would run
# the body with i = 1, 0 on an empty vector).
rel_df_mngfl <- data.frame(icc = rep(NA_real_, length(numeric_cols_mngfl)))
row.names(rel_df_mngfl) <- numeric_cols_mngfl
for(col in numeric_cols_mngfl){
  rel_df_mngfl[col, 'icc'] <- get_icc(col)
}
# Move row names into a dv column, reset the row names, and tag each measure
# as survey or task. seq_len() replaces the accidental seq(1:n) idiom.
rel_df_mngfl$dv = row.names(rel_df_mngfl)
row.names(rel_df_mngfl) = seq_len(nrow(rel_df_mngfl))
rel_df_mngfl$task = 'task'
rel_df_mngfl[grep('survey', rel_df_mngfl$dv), 'task'] = 'survey'
# holt & laury matches "survey" by name but is a task, so override it back
rel_df_mngfl[grep('holt', rel_df_mngfl$dv), 'task'] = "task"
rel_df_mngfl
# Interactive scatter comparing point-estimate reliabilities of transformed
# (log / reflected-log) variables (icc.x, from rel_df_mngfl) against their
# untransformed counterparts (icc.y, from rel_df), identity line overlaid.
# NOTE(review): the gsub patterns use an unescaped '.', so they match any
# character before "ReflogTr"/"logTr" -- likely harmless here but confirm.
# NOTE(review): the trailing `label=dv` is an argument to ggplotly(), not to
# aes(); `dv` is not defined at that scope and tooltip control is normally
# `tooltip = "label"` -- verify intent.
ggplotly(rel_df_mngfl %>%
filter(grepl("logTr", dv)) %>%
mutate(dv = gsub(".ReflogTr", "", dv),
dv = gsub(".logTr", "", dv)) %>%
left_join(rel_df[,c("dv", "icc")], by="dv") %>%
ggplot(aes(icc.x, icc.y, color=task, label=dv))+
geom_point()+
theme_bw()+
xlab("Reliability for transformed variable")+
ylab("Reliability for raw variable")+
geom_abline(slope=1, intercept=0),
label=dv)